
### Project: IADB Government Payroll Analytics - Country
### Project leader: Dr Christian Schuster
### Code author (s): Robert Lipiński
### Date last update: (run below)
file.mtime(rstudioapi::getActiveDocumentContext()$path)

### Script purpose: assigning unique ID for each civil servant (only personal names, without date of birth present in the raw data)

### Execution time: ~10 minutes

### Inputs: 
# 1) /data/intermediate/country_05_limpiar_puesto.[format1]
# 2) /data/intermediate/country_03_limpiar_conjunto (out).[format1]


### Outputs:
# 1) /data/intermediate/country_06_limpiar_id.[format1]

# *) data/intermediate_temp/country_06_limpiar_id (temp1).[format1] [only a temporary file to avoid re-running the full script in case of an error;
# not necessary for executing the script]
# *) data/clean/aggregates/multi_payments_certain.[format1] (list of almost certain instances of people paid >1 time a month)
# *) data/clean/aggregates/multi_payments_suspect.[format1] (list of less certain instances of people paid >1 time a month)


#
# SET-UP --------------------------------------------------------------------------------------------
#

# Start from an empty workspace so objects left over from a previous session cannot leak in
rm(list = ls())

### Source the '00_country_global.R' script (expected to provide the packages, helper functions
### and settings used below, e.g. main_dir, format1, read_flex, write_flex)
source(file.path(dirname(rstudioapi::getActiveDocumentContext()$path), '00_country_global.R'))


# library(installr)
# updateR()

# Make an archive copy of this script
# - the extension is replaced only at the very end of the path (anchored '\\.R$'), so a '.R'
#   occurring elsewhere in the path (e.g. a '.Rproj' folder name) is left untouched
# NOTE(review): every occurrence of 'code' in the path is rewritten to 'code/00_ARCHIVE', so this
# assumes 'code' appears only once in the path (as the scripts folder) - confirm
file.copy(rstudioapi::getSourceEditorContext()$path,
          gsub('code', 'code/00_ARCHIVE', sub('\\.R$', ' - copy.R', rstudioapi::getSourceEditorContext()$path)),
          overwrite = TRUE, copy.date = TRUE)


# '  -------------------------------------------------------------------------------------------------------------------------
# READ DATA -------------------------------------------------------------------------------------------------------------------
# 

t0 <- Sys.time() # record start time

### read the files

## columns this script relies on (compared below against the columns actually present in the data file)
col_select1 <- c(
  'row_id_org', 'id', 'anyo_mes',
  'name_full_original', 'name_full',
  'dataset', 'fecha_ingreso', 'fecha_termino', 'genero', 'region',
  'tipo_estamento_comprimido', 'tipo_calificacion', 'tipo_cargo_clean',
  'puesto_clave', 'organismo_codigo', 'organismo_nombre_clean'
)

## column names available in the step-05 file
col_names <- names(open_dataset(file.path(main_dir, 'data', 'intermediate', "country_05_limpiar_puesto.parquet")))

## main table: the required columns that the step-05 file does contain
country <- read_flex(
  file.path(main_dir, 'data', 'intermediate', "country_05_limpiar_puesto"),
  format = 'parquet',
  col_select = intersect(col_select1, col_names)
)

## supplement: the row key plus the required columns missing from the step-05 file, taken from the step-03 file
add1 <- read_flex(
  file.path(main_dir, 'data', 'intermediate', "country_03_limpiar_conjunto (out)"),
  format = 'parquet',
  col_select = c('row_id_org', setdiff(col_select1, col_names))
)




## set as DT if not already done
## convert both tables to data.table (by reference) unless they already are
if (!inherits(country, 'data.table')) {
  setDT(country)
}
if (!inherits(add1, 'data.table')) {
  setDT(add1)
}

# sanity checks: add1 should have at least as many rows as country,
# and every country row_id_org should also be present in add1
dim(country)
dim(add1)
sf(country$row_id_org %in% add1$row_id_org)

### combine: keep every row of `country` and attach the add1 columns matched on row_id_org
setkeyv(country, 'row_id_org')
setkeyv(add1, 'row_id_org')
country <- add1[country, on = 'row_id_org']



### check current unique ID counts
# distinct counts of the existing id and of the cleaned / original names
fdistinct(country$id)
fdistinct(country$name_full)
fdistinct(country$name_full_original)

sort(col_names)
beep()

### drop rows with a missing month or a missing name, if there are any
country <- country[!(is.na(anyo_mes) | is.na(name_full))]


### XXX fecha_ingreso/termino (re-run in 03)------------------------------------------------------------------------------------------------------------------
# apply to date columns
# Convert every column whose name contains 'fecha' to a date.
# - values without two consecutive digits are set to NA rather than converted
# - the NA is typed (as.Date(NA)) so both if_else() branches have the same class; dplyr versions
#   before 1.1.0 reject an untyped logical NA next to a Date (the columns must be Date afterwards,
#   as the fifelse(..., as.Date(NA), ...) step further below requires it)
# NOTE(review): if_else() evaluates convert_to_date() on the whole column, including the values
# that end up as NA - confirm convert_to_date() tolerates non-date strings
country = country %>% 
  mutate(
    # date_terminate = fecha_termino, date_start = fecha_ingreso, date_publish = fecha_publicacion, # save original columns to compare if you want to check if it worked
    across(contains('fecha'), ~if_else(!grepl('\\d{2}', ., fixed = FALSE), as.Date(NA), convert_to_date(.))) # attempt only if 2 consecutive digits are present
  )

# the dplyr verb hands back a modified copy; re-register it as a data.table so that the
# by-reference (:=) updates that follow operate on a properly allocated table
setDT(country)


### ? assign NA for impossible year and month (xxx but double-check with Christian)  -----------------------------------------------------------------------------------------------------------------------------------
# start dates before 1950 or after end_date1 are treated as impossible -> NA
country[, fecha_ingreso := fifelse(
  fecha_ingreso < ymd("1950-01-01") | fecha_ingreso > end_date1,
  as.Date(NA),
  fecha_ingreso
)]

# end dates before 2010 or after end_date1 are treated as impossible -> NA
country[, fecha_termino := fifelse(
  fecha_termino < ymd("2010-01-01") | fecha_termino > end_date1,
  as.Date(NA),
  fecha_termino
)]

# inspect the share of missing values and the distribution of both date columns
pr_isna(country$fecha_termino)
pr_isna(country$fecha_ingreso)
summary(country$fecha_ingreso)
summary(country$fecha_termino)

hist(country$fecha_ingreso, breaks = 'months')



### + lag month -------------------------------------------------------------------------------------------------------------------------------------------
# Sort so that, within each name, rows run chronologically; lag/lead then refer to the
# previous / next payroll entry recorded under the same name
country <- country[order(name_full, anyo_mes)]
country[, anyo_mes_lag := shift(anyo_mes, type = "lag"), by = name_full]
country[, anyo_mes_lead := shift(anyo_mes, type = "lead"), by = name_full]

# Gap to the next / previous entry in whole CALENDAR months (NA for the last / first entry of a name).
# Dividing the day difference by 30 under-counts gaps that span February (e.g. Jan -> Mar is
# 59 days -> 1 instead of 2; Feb -> May is 89 days -> 2 instead of 3), which would let real
# 3-month breaks slip through the '> 2' filter applied later.
country[, anyo_mes_diff_lead := 12L * (data.table::year(anyo_mes_lead) - data.table::year(anyo_mes)) +
          (data.table::month(anyo_mes_lead) - data.table::month(anyo_mes))]
country[, anyo_mes_diff_lag := 12L * (data.table::year(anyo_mes) - data.table::year(anyo_mes_lag)) +
          (data.table::month(anyo_mes) - data.table::month(anyo_mes_lag))]





#
# USE NAME as (pseudo-)ID ----------------------------------------------------------------------------------------------------------
#

### BASIC CONCEPTS:
#### If you have a guy working as a doctor in hospital in Santiago and then on a contract in a clinic in, say, Valparaiso a few days
#### a month how can you tell whether you are actually looking at the same person or two different people sharing name and rank

#### ~91.5% of month-name_full groups are uniquely identified, i.e. in any given month
### this % of staff on payroll has a unique name
# temp = country[, .N, by = .(anyo_mes, name_full)] # note: works ok, as dplyr
# pr(temp$N)


### Note (1): Doesn't seem to be unique ID per se. There is name and start date of work. 
### There should be an 'id' column based on names (used to assign gender before)

### Note (2): Jornada won't work for identification, because it's missing in ~1/3 of rows and,
### when it is not, the values are all over the place. Neither will 'tipo_calificacion', which
### just contains random stuff in many cases (e.g. work schedules, abbreviations)


### filter to multi-month observations --------------------------------------------------------------------

## filter...
# 1) everyone with >1 monthly entry
# 1) every row of a name that has >1 entry in the same month
country_multi1 <- country[, .N, by = .(name_full, anyo_mes)][N > 1]
country_multi1 = country_multi1[country, on = .(name_full, anyo_mes), nomatch = 0] # works like dplyr inner_join


# 2) everyone with the same name, but a break of more than 2 months in payroll (cannot be sure if 
# this is the same person re-joining the workforce or someone new joining)
country_multi2 = country[!is.na(anyo_mes_diff_lag) & anyo_mes_diff_lag > 2,]

dim(country_multi1)
dim(country_multi2)

# combine
# - bind BY COLUMN NAME: the join output above starts with the join keys (name_full, anyo_mes),
#   so its column order need not match that of `country`; rbindlist()'s default would then bind
#   by position and silently mix up columns
country_multi_all = rbindlist(list(
  country_multi1[, .SD, .SDcols = -c("N")],
  country_multi2
), use.names = TRUE)

# a row can qualify under both 1) and 2) - keep it once
country_multi_all = unique(country_multi_all)
rm(country_multi1, country_multi2)

### xxx do like code checks 1|11|7|2|5 for org-region etc. to see which numbers match? too long?

### count unique features by name -----------------------------------------------------------------------------------------------------------------------------------------------
# count how often a name occurs in different organizations, regions, and months
# for every name, count the distinct organisations, regions, months, qualifications,
# contract start/end dates and (clean) job titles it appears with
country_multi_all[,
  c('n_org', 'n_region', 'n_mes', 'n_calificacion',
    'n_fecha_ingreso', 'n_fecha_termino', 'n_cargo') := lapply(.SD, fdistinct),
  by = name_full,
  .SDcols = c('organismo_codigo', 'region', 'anyo_mes', 'tipo_calificacion',
              'fecha_ingreso', 'fecha_termino', 'tipo_cargo_clean')
]
# (alternative not used: n_cargo2 = fdistinct(tipo_cargo_clean, na.rm=F))



### (temp)save ----------------------------------------------------------------------------------------------------------------------------------
# checkpoint of the ambiguous-name rows, so a later failure does not require re-running everything above
write_flex(
  country_multi_all,
  file.path('data','intermediate_temp', 'country_06_limpiar_id (temp1)'),
  format=format1
)


### NOTE: problem - the majority of people don't have the full set of months (72)

temp = country[, .(n=fdistinct(anyo_mes)), by = name_full] # check how many names are there in all months
pr_na(temp$n) # only 7.9% in all 72 months, with a further 8.2% in 70-71 months

temp2 = country[, .(n=fdistinct(anyo_mes)), by = .(name_full, dataset)] # by dataset? 
tapply(temp2$n, temp2$dataset, pr_na)


# even planta >=70 months only for 18.2% of names and 13% are there for just 1 month 
# subsetting to before 2024 'improves' things just a bit - 23% planta are there 58-60 months

# # xxx is this a case of misspelled names?

# 'contrata' rows of names that appear in exactly 2 distinct 'contrata' months
# - the per-name count is computed in a separate summary table and joined back, so `country`
#   itself is NOT modified (assigning n with := inside `country[...]` would add a stray
#   column `n` to `country` by reference)
names_n2 = country[dataset == 'contrata', .(n = fdistinct(anyo_mes)), by = name_full][n == 2]
temp3 = country[dataset == 'contrata'][names_n2, on = 'name_full', nomatch = 0]
temp3 %>% head

### the data appears to be unclean too - leaving single-month planta
## contracts has many names that are in there for multiple periods, they just 
## are recorded as inexplicably working on planta contract in the same role for 1 month

# all rows whose name matches one example name (the 22nd row of temp3)
# NOTE(review): the name is used as a regular expression and is NA if temp3 has <22 rows
temp4 = country[str_detect(name_full, temp3$name_full[22])]
temp4

### if the above is true, then many of the 13% of 1-month planta contracts are probably not planta
### contracts at all, just other contracts erroneously coded as such for a single (or just a couple of) entries






# '   -----------------------------------------------------------------------------------------------------------------------------------------
# MULTI-MONTH PAYMENTS  ------------------------------------------------------------------------------------------------------
#

### certain --------------------------------------------------------------------------------------------------------------------
# extract instances of (almost) certain multi-month payments
# identify combination of name-position-org-region-start_date that have >1 month entries\
# a payment counts as an (almost) certain duplicate when month, name, position, start date,
# region and organisation all coincide
vars_group <- c('anyo_mes', 'name_full', 'tipo_cargo_clean', 'fecha_ingreso', 'region', 'organismo_nombre_clean')

# combinations occurring more than once, joined back to recover the full rows (inner join)
multi_certain <- country[, .N, by = vars_group][N > 1][country, on = vars_group, nomatch = 0]

# alphabetical by name, most recent month first
setorderv(multi_certain, cols = c('name_full', 'anyo_mes'), order = c(1L, -1L))

### save as .parquet or .qs to avoid having a file that weighs >1GB
# write_flex(multi_certain, file.path('data', 'clean', 'aggregates', 'multi_payments_certain'), format=format1)


### suspected -----------------------------------------------------------------------------------------------
 
# leave only people with >1 month entry
# multi_payments_suspected = country_multi

# all names in one regions with multi payments?
# looser criterion: same name paid more than once in the same month within one region
vars_group <- c('anyo_mes', 'name_full', 'region')

# combinations occurring more than once, joined back to recover the full rows (inner join)
multi_suspect <- country[, .N, by = vars_group][N > 1][country, on = vars_group, nomatch = 0]

# alphabetical by name, most recent month first
setorderv(multi_suspect, cols = c('name_full', 'anyo_mes'), order = c(1L, -1L))

### save as .parquet or .qs to avoid having a file that weighs >1GB
# write_flex(multi_suspect, file.path('data', 'clean', 'aggregates', 'multi_payments_suspect'), format=format1)





# ' --------------------------------------------------------------------------------------------------------------------
# MULTI-NAME --------------------------------------------------------------------------------------------------------------------
#

# # whole data subsetted to relevant columns and cleaned in a basic way 
# country = read_flex(file.path('data','intermediate_temp', 'country_06_limpiar_id (temp1)'), format='csv')

# rows identified initially as ambigious names - to be filtered down below and re-joined to country
country_multi_all = read_flex(file.path('data','intermediate_temp', 'country_06_limpiar_id (temp1)'), format=format1 )
# the filters below use data.table syntax - make sure the re-loaded object is a data.table
# (same guard as applied to the inputs read at the top of the script)
if(!any(grepl('data.table', class(country_multi_all)))){setDT(country_multi_all)}


### < single org/region/cargo/contract dates ------------------------------------------------------------------------------------------------

### IF name occurs only in one organisation and (or?) one region -> most probably unique guy
## note: we cannot really use other variables as rank can change due to promotion and position
## and classification types are too diverse to expect consistent labelling
# temp = country %>% filter(n_org + n_region > 2)

# Keep only the names that remain ambiguous. A name is dropped (treated as one person) when it
#  - occurs in exactly one organisation AND exactly one region, or
#  - has exactly one (clean) job title, or
#  - has exactly one calificacion, or
#  - has exactly one contract start date AND exactly one contract end date
# (earlier idea, not used: temp %>% filter(!(n_cargo_clave == 1 & n_cargo_clave2 == 1)))
country_multi <- country_multi_all[
  (n_org != 1 | n_region != 1) &
    n_cargo != 1 &
    n_calificacion != 1 &
    (n_fecha_ingreso != 1 | n_fecha_termino != 1)
]


### checks -> how many unique names are left (count, and share of all names)?
fdistinct(country_multi$name_full)
fdistinct(country_multi$name_full) / fdistinct(country$name_full)

  
### NOTE: common names like do include clearly different individuals
  
### SO what we are left with are ~5% of names who might:
### be named the same and work different jobs (either at the same period or not)
  

### < job-changers --------------------------------------------------------------------------------------------------

### calculate monthly overlap and gaps
# per name: anyo_mes_n = result of fdistinct() on the name's months (presumably the number of
# distinct months - fdistinct is defined outside this script, confirm)
country_multi[, anyo_mes_n := fdistinct(anyo_mes), by = .(name_full)]
# per name: anyo_mes_diff_max = max_miss() of the gaps to the previous entry (presumably a
# maximum that ignores missing values - max_miss is defined outside this script, confirm)
country_multi[, anyo_mes_diff_max := max_miss(anyo_mes_diff_lag), by = .(name_full)]

### IF no overlap in months or an overlap without preceding overlap -> probably same guy changing jobs
# still leaves the possibility of different guy taking up a job in different place afterwards

# names1 = names having at least one row that does NOT satisfy either of:
#   (a) anyo_mes_n == 1 and anyo_mes_diff_max <= 3
#   (b) anyo_mes_n == 2 and both the previous and the next row have anyo_mes_n == 1 (or do not exist)
# rows for which the condition evaluates to NA are not selected
# NOTE(review): shift() is evaluated here over the whole table (there is no `by`), so the
# 'previous'/'next' row can belong to a different name; anyo_mes_n is constant within a name,
# so within a name's rows the shifted value equals the current one - confirm this is intended
names1 <- country_multi[
  !(
    (anyo_mes_n == 1 & anyo_mes_diff_max <= 3) |
      (anyo_mes_n == 2 & (data.table::shift(anyo_mes_n, 1, type = "lag") == 1 | is.na(data.table::shift(anyo_mes_n, 1, type = "lag"))) &
         (data.table::shift(anyo_mes_n, 1, type = "lead") == 1 | is.na(data.table::shift(anyo_mes_n, 1, type = "lead"))))
  ),
  unique(name_full)
]

# keep all rows of the names selected above
country_multi = country_multi[name_full %in% names1]


# SAME 1ST AND LAST DATES - if a person in different jobs for exactly the same duration of time
# then probably the same person too

### < duration on payroll ----------------------------------------------------------------------------- 

# add mes_max and mes_min within group
# whole calendar months from `earlier` to `later` (both date vectors)
months_between <- function(later, earlier) {
  12L * (data.table::year(later) - data.table::year(earlier)) +
    (data.table::month(later) - data.table::month(earlier))
}

# last (mes_max) and first (mes_min) payroll month of each job spell,
# a spell being a name within one organisation, region and (clean) job title
country_multi[, `:=`(
  mes_max = max(anyo_mes, na.rm = TRUE),
  mes_min = min(anyo_mes, na.rm = TRUE)
), by = .(name_full, organismo_codigo, region, tipo_cargo_clean)]

# per name: by how many months do the END months (mes_max_range) and the START months
# (mes_min_range) of its spells differ
# - mes_max_range compares end months with end months (max vs min of mes_max); subtracting
#   min(mes_min) instead would measure the total time span, not the spread of end dates
# - ranges are expressed in calendar months so that the threshold below really means months;
#   a plain difference of dates is in days, for which '> 2' would be true for any two
#   different months
country_multi[, `:=`(
  mes_max_range = months_between(max(mes_max, na.rm = TRUE), min(mes_max, na.rm = TRUE)),
  mes_min_range = months_between(max(mes_min, na.rm = TRUE), min(mes_min, na.rm = TRUE))
), by = name_full]

# filter based on range thresholds: if start or end months differ by 2 months or less,
# treat it as the same person moonlighting and drop the name from the ambiguous set
country_multi <- country_multi[mes_max_range > 2 & mes_min_range > 2]



  
### + person_id (initial) ---------------------------------------------------------------------------------------------------------------------------
# those names that are left might or might not be the same person - for now let's treat everyone 
# uniquely given their organismo, region, contract, position, and qualifications - below we differentiate further
setorder(country_multi, name_full, anyo_mes)


### ID based on name + key job characteristics for the names still left in 'country_multi'

## doing it across the full set of job characteristics produces 379k unique ID out of just 74k names
## which appears excessive (unique ID = 14% of all rows, while for names NOT in country_multi, so assumed to
## be unique the same ratio is ~3%)

## tipo_calificacion with 197k unique entries might be too unique, just as tipo_cargo_clean with 223k unique entries
# country_multi[, person_id_strict := .GRP,
#             by = .(name_full, organismo_codigo, region, dataset, tipo_cargo_clean, tipo_calificacion)]

fdistinct(country$tipo_cargo_clean)

## alternatives considered - kept for reference only; running them had no effect on the result,
## because each assignment was overwritten by the final choice below
# using just name and organization - more sensible 174k IDs, 6.7% of all rows (up to 218k and 8.4% if clean rank also used)
# country_multi[, person_id := .GRP, by = .(name_full, organismo_codigo, region)]
# country_multi[, person_id := .GRP, by = .(name_full, organismo_codigo, tipo_estamento_comprimido, region)]

## final choice
# using just name and region yields 91k IDs so 3.5% of all rows, which is very close to the % in the subset of data with unique names
country_multi[, person_id := .GRP, by = .(name_full, region)]



### ID based on just the name for everyone NOT in 'country_multi'
# NOTE(review): rows are excluded by row_id_org, not by name - other rows of a name that is in
# country_multi get a name-only ID here, different from that name's IDs above; confirm intended
country_id = country[!row_id_org %in% country_multi$row_id_org, .(row_id_org, region, name_full)]
country_id[, person_id := .GRP, by = .(name_full)]

# .GRP restarts at 1 in each table, so without an offset ID 1, 2, ... would denote one person in
# country_multi and a different person in country_id once the two are stacked below.
# Shift the name-only IDs past the highest ID already used in country_multi.
id_offset = if (nrow(country_multi) > 0L) max(country_multi$person_id) else 0L
country_id[, person_id := person_id + id_offset]



### (*)checks - calculate unique names, IDs and shares
fdistinct(country_multi$name_full)
fdistinct(country_multi$name_full)/nrow(country_multi)
fdistinct(country_multi$person_id)
fdistinct(country_multi$person_id)/nrow(country_multi)

fdistinct(country_id$person_id)/nrow(country_id) # and unique ID to row number ratio in the subset of data with unique names?

# region as an additional unique classifier?
temp = country_id[, .(n=fdistinct(region)), by=name_full] # generally 96.6% of people in just 1 region - mobility very low
temp = country_multi[, .(n=fdistinct(region)), by=name_full] # in country_multi 'only' 79% in 1 region, suggesting that we capture many multi-month
# payments that are given to same-named individuals in different regions in all likelihood, as normally having >1 region doesn't happen too often
pr_na(temp$n)

### combine (just person_id, name_full, and row_id_org)
country_id = rbindlist(list(
  country_id[,.(row_id_org, person_id, name_full)],
  country_multi[,.(row_id_org, person_id, name_full)]
), use.names = TRUE)



  

### combine with full --------------------------------------------------------------------------------------------------------------------

# re-read the complete step-05 file (all columns) and attach the new person_id to it
country_save = read_flex(file.path(main_dir, 'data', 'intermediate', "country_05_limpiar_puesto"), format = format1)
# setindex() and the join below need a data.table (same guard as for the inputs read at the top)
if(!any(grepl('data.table', class(country_save)))){setDT(country_save)}

setindex(country_save, row_id_org)
setindex(country_id, row_id_org)
# keeps every row of country_save; rows without a match in country_id get NA person_id
country_save = country_id[country_save, on = 'row_id_org']

# names(country_save %>% select(-c(id, name_full_original, tipo_estamento, 
#                                matches('i\\.'))))

# drop superseded columns and the 'i.'-prefixed duplicates created by the join
# - any_of() skips columns that are absent (this script already allows for some of the listed
#   columns being missing from the step-05 file), instead of stopping with an error
# - '^i\\.' is anchored so only names STARTING with 'i.' are dropped
country_save = country_save %>% select(-c(any_of(c('id', 'name_full_original', 'tipo_estamento')),
                                          matches('^i\\.')))


### save -------------------------------------------------------------------------------------------------------
# free memory before writing: remove every data set (data.frame / data.table) except the one to save
# - only tables are removed; functions and settings sourced at the top (e.g. write_flex, format1,
#   exec_time_fun, t0) must survive, because the lines that follow still use them
rm(list = setdiff(names(Filter(is.data.frame, mget(ls()))), "country_save"))
write_flex(country_save, file.path('data', 'intermediate', 'country_06_limpiar_id'), format=format1)


### (*) checks
# checks > ratio of mean gross pay, men over women, by year
country_save[,
  fmean(pago_bruto[genero == 'hombre']) / fmean(pago_bruto[genero == 'mujer']),
  by = anyo
]

# checks > gender directivo gap (women should be ~40-44% of directivos):
# share of distinct directivo IDs that belong to women, by year
country_save[, {
  is_directivo <- tipo_estamento_comprimido == "directivo"
  uniqueN(person_id[is_directivo & genero == 'mujer']) / uniqueN(person_id[is_directivo])
}, by = anyo]

# release memory, signal completion, report execution time
gc()
beep()
exec_time_fun('exec_time')


#
# FIN DEL CODIGO  --------------------------------------------------------------------------------------------
# 